# Import transcripts ----------------------------------------------------------------------------------------------------------------
transcripts_data <- "data/confidential-data-not-for-publication/raw/transcripts"

library(readxl)

transcripts_dir <- list.files(transcripts_data, full.names = TRUE) %>%
  str_subset("~\\$", negate = TRUE) %>%
  str_subset("IDs", negate = TRUE)

# List all the sheets in each Excel file
list_of_sheets <- transcripts_dir %>%
  map(~ excel_sheets(.x)) %>%
  map(~ str_subset(.x, "template|Copy of", negate = TRUE))

# Get list of transcript dir with lengths that match the number of sheets
transcripts_dir_rep <- transcripts_dir %>%
  map2(list_of_sheets, ~ rep(.x, length(.y)))

# Import each sheet into a tibble, for each excel file and sheet combo
list_of_tibbles <- tibble(
  transcripts_dir_rep = transcripts_dir_rep %>% unlist(),
  sheet = list_of_sheets %>% unlist()
) %>%
  mutate(
    data = map2(transcripts_dir_rep, sheet, ~ read_excel(.x, sheet = .y))
  )

# Clean transcript data ----------------------------------------------------------------------------------------------------------------

list_of_tibbles_clean <- list_of_tibbles %>%

  # Clean IDs
  mutate(id_clean = sheet %>% str_squish %>% str_replace_all("\\s", "")) %>%
  filter(id_clean != "01F002") %>%
  mutate(
    id_clean = case_when(
      id_clean == "01F326" & str_detect(transcripts_dir_rep, "Madhavan") ~ "01F326_Madhavan",
      id_clean == "01F326" & str_detect(transcripts_dir_rep, "Sathya") ~ "01F326_Sathya",
      TRUE ~ id_clean
    ),

    id_clean = case_when(
      id_clean == "01M005" & str_detect(transcripts_dir_rep, "Madhavan") ~ "01M005_Madhavan",
      id_clean == "01M005" & str_detect(transcripts_dir_rep, "Sathya") ~ "01M005_Sathya",
      TRUE ~ id_clean
    )
  ) %>%
  dups_report(id_clean) %>%

  #   Clean column names
  rowwise() %>%
  mutate(
    data_clean = list(data %>% janitor::clean_names()),
    data_clean = list(
      data_clean %>% mutate(across(
        c(pair_id_which_pair_of_options_are_they_discussing, timestamp_when_in_the_video_does_this_line_of_speech_start_write_in_the_format_minutes_seconds_e_g_04_22),
        as.character
      ))
    )
  ) %>%
  unnest(data_clean) %>%
  rename(
    who_speaking = who_is_speaking_lead_r1_r2_or_r3,
    pair_id = pair_id_which_pair_of_options_are_they_discussing,
    timestamp = timestamp_when_in_the_video_does_this_line_of_speech_start_write_in_the_format_minutes_seconds_e_g_04_22,
    speech_tamil = speech_in_tamil_write_down_exactly_what_they_are_saying_not_just_a_summary,
    speech_english = speech_english_translated,
    audible = audible_is_the_speech_audible,
    notes = notes_any_notes_on_the_manner_of_speaking_who_it_is_being_addressed_to_uncertainty_about_what_is_being_said_etc,
    translation = translation,
    pro_a_b = argument_in_favour_of,
    notes_on_reasons = notes_on_reasons_e_g_other_reasons
  )

list_of_tibbles_clean %>% print_names

list_of_tibbles_clean %>% filter(!is.na(type_of_reason_4)) %>% count_prop(transcripts_dir_rep, sheet, id_clean, type_of_reason_4)

# NEED TO CLEAN "EXTRA" COLUMN NAMES
d_transcripts <- list_of_tibbles_clean %>%
  select(-data) %>%
  bind_rows() %>%
  rename(notes_general = notes_any_notes_on_the_manner_of_speaking_who_it_is_being_addressed_to_etc) %>%
  count_prop(who_speaking) %>%
  mutate(
    who_speaking = case_when(
      who_speaking == "0ther" ~ "Other",
      who_speaking == "Others" ~ "Other",
      who_speaking == "other" ~ "Other",
      who_speaking == "Any respondents" ~ "R (unclear)",
      TRUE ~ who_speaking
    )
  ) %>%
  count_prop(who_speaking) %>%

  count_prop(pro_a_b) %>%
  mutate(
    pair_id = ifelse(pair_id == "NA", NA, pair_id)
  ) %>%

  count_prop(type_of_reason_1) %>%

  #   DEAL WITH NAs
  filter(!(is.na(who_speaking) & is.na(notes) & is.na(speech_tamil))) %>%
  count_prop(who_speaking)

# Match with R1 data ----------------------------------------------------------------------------------------------------------------

r1_choices_by_group <- r1_choices %>%
  select(
    group_id, discuss_type, stratum_id, discuss_type_label, arm_label, round,
    matches("r1_.*_(1|2)"), pair_includes_trans, r1_choose_comparator,
    quality_diff, educ_diff, english_diff, quality_included, educ_included, english_included,
    comparator_order_in_pair
  ) %>%
  mutate(across(where(is.numeric), as.integer)) %>%
  filter(discuss_type == "discussion_full") %>%
  group_by(group_id, round) %>%
  summarise(across(everything(), first_non_na)) %>%
  ungroup %>%
  dups_report(group_id, round) %>%
  arrange(group_id, round)

any_na <- function(x) {
  x_non_na <- x[!is.na(x)]
  if (length(x_non_na) == 0) {
    return(NA)
  } else {
    return(any(x_non_na))
  }
}

# MATCH WITH r1 data
transcripts_with_r1 <- left_join(
  d_transcripts %>% mutate(pair_id = as.integer(pair_id)),
  r1_choices_by_group,
  by = c("id_clean" = "group_id", "pair_id" = "round")
) %>%
  select(-group_id) %>%
  rename(group_id = id_clean) %>%
  group_by(group_id) %>%
  mutate(transcript_line_id = row_number()) %>%
  ungroup %>%

  mutate(
    comparator_a_b = case_when(
      comparator_order_in_pair == 1 ~ "A",
      comparator_order_in_pair == 2 ~ "B"
    )
  ) %>%
  mutate(
    pro_comparator_arg = case_when(
      pro_a_b == "A" & comparator_a_b == "A" ~ TRUE,
      pro_a_b == "B" & comparator_a_b == "B" ~ TRUE,
      pro_a_b == "A" & comparator_a_b == "B" ~ FALSE,
      pro_a_b == "B" & comparator_a_b == "A" ~ FALSE
    )
  ) %>%
  mutate(
    pro_trans_arg = ifelse(pair_includes_trans, pro_comparator_arg, NA)
  ) %>%

  count_prop(pro_a_b) %>%
  group_by(group_id, pair_id) %>%
  mutate(
    disagreement = any_na(pro_a_b == "A") & any_na(pro_a_b == "B"),
    starts_pro_trans = first_non_na(pro_trans_arg == TRUE)
  ) %>%
  ungroup %>%
  count_prop(disagreement) %>%
  count_prop(starts_pro_trans) %>%
  relocate(group_id, transcript_line_id)

transcripts_with_r1 %>% view()


# Export for translation ----------------------------------------------------------------------------------------------------------------

transcripts_for_translation <- transcripts_with_r1 %>%
  count_print(who_speaking) %>%
  left_join(
    tibble(
      who_speaking = c("Lead", "R (unclear)", "R1", "R2", "R3", "Other"),
      who_speaking_label = c("Facilitator", "Resp. (unknown)", "Resp. 1", "Resp. 2", "Resp. 3", "Other (e.g. bystander)")
    )
  ) %>%
  mutate(speech_with_labelling = str_glue(
    "[Line: {transcript_line_id}, Speaker: {who_speaking_label}] {speech_tamil}"
  )) %>%

  # ADD PAIR ID label when it first appears
  # add indicator for when each pair_id first appears:
  group_by(group_id) %>%
  mutate(
    first_line_of_pair = ifelse(duplicated(pair_id), NA, pair_id)
  ) %>%
  mutate(
    first_line_of_pair_label = ifelse(!is.na(first_line_of_pair), str_glue("\n\n ** Choice {first_line_of_pair} ** \n\n"), NA),
    speech_with_labelling = ifelse(
      !is.na(first_line_of_pair_label),
      str_c(first_line_of_pair_label, speech_with_labelling),
      speech_with_labelling
    )
  ) %>%
  summarise(
    speech_for_translation = str_c(speech_with_labelling, collapse = "\n")
  )

transcripts_for_translation %>% write_excel_csv(
  "data/confidential-data-not-for-publication/cleaned/transcripts_for_translation.csv"
)


#translations <- read_csv("data/confidential-data-not-for-publication/raw/translated_transcripts_v0.csv") %>%
  #mutate(
    #group_id = transcripts_for_translation$group_id[1:nrow(.)],
  #) %>%
  #relocate(group_id) %>%
  #select(-input) %>%
  #separate_rows(output, sep = "\n") %>%
  #filter(str_detect(output, "\\[Line:")) %>%
  #mutate(
    #transcript_line_id = output %>% str_extract("\\d+") %>% as.integer(),
    #who_speaking_label = output %>% str_extract("(?<=Speaker: ).*(?=\\])"),
    #speech_english = output %>% str_extract("(?<=\\] ).*")
  #)

transcripts_with_r1

#transcripts_with_translation <- transcripts_with_r1 %>%
  #select(-speech_english, -translation) %>%
  #inner_join(
    #translations,
    #by = c("group_id", "transcript_line_id"),
    #suffix = c("", "_translation_df")
  #) %>%
  #count_prop(who_speaking, who_speaking_label)



# Import translations ----------------------------------------------------------------------------------------------------------------

# Import all .txt files in the data/transcript_translations folder
translations_dir <- "data/confidential-data-not-for-publication/raw/transcript_translations"

# Clean translations data before merging
translations_unmerged <- tibble(
  file = list.files(translations_dir, full.names = TRUE) %>% str_subset(".txt$")
) %>%
  mutate(translation_id = str_extract(file, "\\d+") %>% as.numeric()) %>%
  arrange(translation_id) %>%
  mutate(group_id = transcripts_for_translation$group_id) %>%

  # Import file:
  mutate(
    translation_text_all = file %>% map_chr(read_file),
    speech_english = translation_text_all %>%
      map(~ str_split(.x, "\n+") %>% .[[1]])
  ) %>%
  unnest(speech_english) %>%
  filter(!str_detect(speech_english, "\\*+.*Choice \\d")) %>%
  mutate(speech_english = speech_english %>%
    str_replace_all("My line\\:|My line\\: Line|My Line\\:|My line", "Line:") %>%
    str_replace_all("Respued \\(unknown\\)", "Resp. (unknown)") %>%
    str_replace_all("Response (\\d)", "Resp. \\1")
  ) %>%
  mutate(
    transcript_line_id = str_match(speech_english, "\\[Line\\: (\\d+)")[, 2] %>% as.integer,
    who_speaking_label = str_match(speech_english, "(Resp\\. \\d|Fas?cili\\w*|Resp\\. \\(unknown\\)|Other( \\(e\\.g\\. bystander\\))?)") %>% .[, 1],
  ) %>%
  mutate(
    who_speaking_label = who_speaking_label %>%  str_replace("Fas?cili\\w*", "Facilitator") %>%
      str_replace("Other$", "Other (e.g. bystander)")
  ) %>%
  left_join(
    tibble(
      who_speaking = c("Lead", "R (unclear)", "R1", "R2", "R3", "Other"),
      who_speaking_label = c("Facilitator", "Resp. (unknown)", "Resp. 1", "Resp. 2", "Resp. 3", "Other (e.g. bystander)")
    )
  )


# Check 1: the number of obs in each file
n_lines_translations <- translations_unmerged %>% group_by(group_id, translation_id) %>% summarise(n_lines_tr = n_distinct(transcript_line_id))
n_lines_original <- transcripts_with_r1 %>% group_by(group_id) %>% summarise(n_lines_original = n_distinct(transcript_line_id))

n_lines_merged <- full_join(
  n_lines_translations,
  n_lines_original,
  by = "group_id"
) %>%
  filter(n_lines_tr != n_lines_original) %>%
  mutate(diff = n_lines_tr - n_lines_original) %>%
  arrange(desc(diff)) %>%
  print_all

# Check 2: make sure [ ] format is correct
translations_unmerged %>%
  select(translation_id, group_id, transcript_line_id, who_speaking, speech_english) %>%
  filter(!str_detect(speech_english, "\\[.*\\]")) %>%
  print

# Merge translations with original data ----------------------------------------------------------------------------------------------------------------

translations_merged <- translations_unmerged %>%
  full_join(
    transcripts_with_r1 %>% select(-speech_english),
    by = c("group_id", "transcript_line_id"),
    suffix = c("_tr", "")
  ) %>%
  arrange(group_id, transcript_line_id) %>%
  mutate(speech_english = speech_english %>% str_replace("\\[.*\\]", "") %>% str_trim()) %>%
  rowwise() %>%
  mutate(
    reasons_joined = list(
      c(type_of_reason_1, type_of_reason_2, type_of_reason_3, type_of_reason_4)
    )
  ) %>%
  ungroup %>%
  mutate(
    # Remove NAs, and if length is 0, just return NA
    reasons_list = reasons_joined %>% map(~ .x[!is.na(.x)]),
    reasons_joined = reasons_list %>% map_chr(~ ifelse(length(.x) == 0, NA, str_c(.x, collapse = "; ")))
  ) %>%
  mutate(
    pro_social = str_detect(reasons_joined, "Respect for person / people|Equality / justice|Want to give opportunity to this person or to help them")
  ) %>%
  count_prop(pro_social) %>%
  mutate(across(c(speech_tamil, speech_english), ~ ifelse(str_detect(.x, regex("^(#?I?INAUDIBLE#?|IINUDIBLE|NNAUDIBLE|NA|\\s+)$", ignore_case = TRUE)), NA, .x))) %>%
  mutate(across(c(speech_tamil, speech_english), ~ .x %>% str_replace_all('"', "")))


# Export missing lines, make sure it works in Excel with Tamil unicode
translations_merged %>%
  select(translation_id, group_id, transcript_line_id, who_speaking, who_speaking_tr, speech_tamil, speech_english) %>%
  filter(!is.na(speech_tamil) & is.na(speech_english)) %>%
  write_excel_csv("data/confidential-data-not-for-publication/cleaned/transcripts_for_translation_fill_in.csv")

# IMPORT back in missing lines
missing_lines_import <- read_csv("data/confidential-data-not-for-publication/cleaned/transcripts_for_translation_filled.csv") %>%
  select(group_id, transcript_line_id, speech_english = output)

# Merge in missing lines
translations_filled <- translations_merged %>%
  left_join(
    missing_lines_import,
    by = c("group_id", "transcript_line_id"),
    suffix = c("", "_filled")
  ) %>%
  mutate(
    speech_english = ifelse(is.na(speech_english), speech_english_filled, speech_english)
  ) %>%
  select(-speech_english_filled)


translations_merged %>%
  #   Check 1: who_speaking
  select(translation_id, group_id, transcript_line_id, who_speaking, who_speaking_tr, speech_tamil, speech_english)


transcripts_by_round <- translations_filled %>%
  group_by(group_id, pair_id, stratum_id) %>%
  filter(!is.na(pair_id)) %>%
  summarise(
    across(
      c(disagreement, pair_includes_trans, r1_choose_comparator, starts_pro_trans),
      ~ first_non_na(.x)
    )
  ) %>%
  mutate(r1_choose_comparator = as.logical(r1_choose_comparator))


translations_filled


# Calculate discussion length ----------------------------------------------------------------------------------------------------------------
transcript_discussion_length <- translations_filled %>%
  filter(!is.na(speech_english), str_detect(who_speaking_tr, "^R")) %>%
  group_by(group_id, pair_includes_trans) %>%
  summarise(
    discussion_length = n(),
    discussion_length_pro_anti = sum_na(!is.na(pro_a_b)),
    discussion_length_characters = sum_na(nchar(speech_english)),
    discussion_length_words = sum_na(str_count(speech_english, "\\w+\\s*")),
    discussion_length_words_pro_anti = sum_na(str_count(speech_english[!is.na(pro_a_b)], "\\w+\\s*"))
  ) %>%
  ungroup %>%
  mutate(pair_includes_trans = ifelse(pair_includes_trans == 1, "trans", "non_trans")) %>%
  filter(!is.na(pair_includes_trans)) %>%
  pivot_wider(names_from = pair_includes_trans, values_from = c(discussion_length, discussion_length_pro_anti, discussion_length_characters, discussion_length_words, discussion_length_words_pro_anti))

transcript_discussion_length %>% glimpse

# Calculate discussion length for facilitators (Lead) ----------------------------------------------------------------------------------------------------------------
transcript_discussion_length_lead <- translations_filled %>%
  filter(!is.na(speech_english), str_detect(who_speaking_tr, "^Lead")) %>%

# Remove the very first line (explanation)
group_by(group_id) %>% 

  group_by(group_id, pair_includes_trans) %>%
  summarise(
    discussion_length_lead = n(),
    discussion_length_pro_anti_lead = sum_na(!is.na(pro_a_b)),
    discussion_length_characters_lead = sum_na(nchar(speech_english)),
    discussion_length_words_lead = sum_na(str_count(speech_english, "\\w+\\s*")),
    discussion_length_words_pro_anti_lead = sum_na(str_count(speech_english[!is.na(pro_a_b)], "\\w+\\s*"))
  ) %>%
  ungroup %>%
  mutate(pair_includes_trans = ifelse(pair_includes_trans == 1, "trans", "non_trans")) %>%
  filter(!is.na(pair_includes_trans)) %>%
  pivot_wider(names_from = pair_includes_trans, values_from = c(discussion_length_lead, discussion_length_pro_anti_lead, discussion_length_characters_lead, discussion_length_words_lead, discussion_length_words_pro_anti_lead))


# Join lead facilitator discussion length with participant discussion length
transcript_discussion_length <- transcript_discussion_length %>%
  left_join(transcript_discussion_length_lead, by = "group_id") %>% 
  # Calculate ratio of lead discussion length to total discussion length
  mutate(
    # For trans pairs
    discussion_ratio_lead_trans = discussion_length_words_lead_trans / (discussion_length_words_lead_trans + discussion_length_words_trans),
    # For non-trans pairs
    discussion_ratio_lead_non_trans = discussion_length_words_lead_non_trans / (discussion_length_words_lead_non_trans + discussion_length_words_non_trans)
  )


translations_filled %>% 
mutate( discussion_length_words_lead = str_count(speech_english, "\\w+\\s*")) %>% 
select(speech_english, who_speaking_tr, matches("words"))


save(translations_filled, transcript_discussion_length, file = "data/cleaned/transcripts_clean.RData")
write_csv(translations_filled, "data/cleaned/transcripts_clean.csv")
